Sys.Date()
## [1] "2020-12-18"

Authors: Hanna Klimczak, Kamil Pluciński

Libraries used

library(plotly)
library(knitr)
library(kableExtra)
library(caret)

Providing data reproductibility

To ensure that running the notebook will always return the same output, we set seed to 0.

set.seed(0)

Reading data from file

data <- read.csv("Life_Expectancy_Data.csv")
kable(head(data), "html") %>% kable_styling("striped") %>% scroll_box(width = "100%")
Country Year Status Life.expectancy Adult.Mortality infant.deaths Alcohol percentage.expenditure Hepatitis.B Measles BMI under.five.deaths Polio Total.expenditure Diphtheria HIV.AIDS GDP Population thinness..1.19.years thinness.5.9.years Income.composition.of.resources Schooling
Afghanistan 2015 Developing 65.0 263 62 0.01 71.279624 65 1154 19.1 83 6 8.16 65 0.1 584.25921 33736494 17.2 17.3 0.479 10.1
Afghanistan 2014 Developing 59.9 271 64 0.01 73.523582 62 492 18.6 86 58 8.18 62 0.1 612.69651 327582 17.5 17.5 0.476 10.0
Afghanistan 2013 Developing 59.9 268 66 0.01 73.219243 64 430 18.1 89 62 8.13 64 0.1 631.74498 31731688 17.7 17.7 0.470 9.9
Afghanistan 2012 Developing 59.5 272 69 0.01 78.184215 67 2787 17.6 93 67 8.52 67 0.1 669.95900 3696958 17.9 18.0 0.463 9.8
Afghanistan 2011 Developing 59.2 275 71 0.01 7.097109 68 3013 17.2 97 68 7.87 68 0.1 63.53723 2978599 18.2 18.2 0.454 9.5
Afghanistan 2010 Developing 58.8 279 74 0.01 79.679367 66 1989 16.7 102 66 9.20 66 0.1 553.32894 2883167 18.4 18.4 0.448 9.2

Dataset summary and basic statistics

nrow(data)
## [1] 2938
summary(data)
##    Country               Year         Status          Life.expectancy
##  Length:2938        Min.   :2000   Length:2938        Min.   :36.30  
##  Class :character   1st Qu.:2004   Class :character   1st Qu.:63.10  
##  Mode  :character   Median :2008   Mode  :character   Median :72.10  
##                     Mean   :2008                      Mean   :69.22  
##                     3rd Qu.:2012                      3rd Qu.:75.70  
##                     Max.   :2015                      Max.   :89.00  
##                                                       NA's   :10     
##  Adult.Mortality infant.deaths       Alcohol        percentage.expenditure
##  Min.   :  1.0   Min.   :   0.0   Min.   : 0.0100   Min.   :    0.000     
##  1st Qu.: 74.0   1st Qu.:   0.0   1st Qu.: 0.8775   1st Qu.:    4.685     
##  Median :144.0   Median :   3.0   Median : 3.7550   Median :   64.913     
##  Mean   :164.8   Mean   :  30.3   Mean   : 4.6029   Mean   :  738.251     
##  3rd Qu.:228.0   3rd Qu.:  22.0   3rd Qu.: 7.7025   3rd Qu.:  441.534     
##  Max.   :723.0   Max.   :1800.0   Max.   :17.8700   Max.   :19479.912     
##  NA's   :10                       NA's   :194                             
##   Hepatitis.B       Measles              BMI        under.five.deaths
##  Min.   : 1.00   Min.   :     0.0   Min.   : 1.00   Min.   :   0.00  
##  1st Qu.:77.00   1st Qu.:     0.0   1st Qu.:19.30   1st Qu.:   0.00  
##  Median :92.00   Median :    17.0   Median :43.50   Median :   4.00  
##  Mean   :80.94   Mean   :  2419.6   Mean   :38.32   Mean   :  42.04  
##  3rd Qu.:97.00   3rd Qu.:   360.2   3rd Qu.:56.20   3rd Qu.:  28.00  
##  Max.   :99.00   Max.   :212183.0   Max.   :87.30   Max.   :2500.00  
##  NA's   :553                        NA's   :34                       
##      Polio       Total.expenditure   Diphtheria       HIV.AIDS     
##  Min.   : 3.00   Min.   : 0.370    Min.   : 2.00   Min.   : 0.100  
##  1st Qu.:78.00   1st Qu.: 4.260    1st Qu.:78.00   1st Qu.: 0.100  
##  Median :93.00   Median : 5.755    Median :93.00   Median : 0.100  
##  Mean   :82.55   Mean   : 5.938    Mean   :82.32   Mean   : 1.742  
##  3rd Qu.:97.00   3rd Qu.: 7.492    3rd Qu.:97.00   3rd Qu.: 0.800  
##  Max.   :99.00   Max.   :17.600    Max.   :99.00   Max.   :50.600  
##  NA's   :19      NA's   :226       NA's   :19                      
##       GDP              Population        thinness..1.19.years
##  Min.   :     1.68   Min.   :3.400e+01   Min.   : 0.10       
##  1st Qu.:   463.94   1st Qu.:1.958e+05   1st Qu.: 1.60       
##  Median :  1766.95   Median :1.387e+06   Median : 3.30       
##  Mean   :  7483.16   Mean   :1.275e+07   Mean   : 4.84       
##  3rd Qu.:  5910.81   3rd Qu.:7.420e+06   3rd Qu.: 7.20       
##  Max.   :119172.74   Max.   :1.294e+09   Max.   :27.70       
##  NA's   :448         NA's   :652         NA's   :34          
##  thinness.5.9.years Income.composition.of.resources   Schooling    
##  Min.   : 0.10      Min.   :0.0000                  Min.   : 0.00  
##  1st Qu.: 1.50      1st Qu.:0.4930                  1st Qu.:10.10  
##  Median : 3.30      Median :0.6770                  Median :12.30  
##  Mean   : 4.87      Mean   :0.6276                  Mean   :11.99  
##  3rd Qu.: 7.20      3rd Qu.:0.7790                  3rd Qu.:14.30  
##  Max.   :28.60      Max.   :0.9480                  Max.   :20.70  
##  NA's   :34         NA's   :167                     NA's   :163

Dealing with missing values

As we can see from the summary above, there are some missing values in the dataset. Due to the fact that life.expectancy is the most important attribute for our analysis, we have decided to remove all rows where life.expectancy is NA.

data_new <- data[complete.cases(data[ , 'Life.expectancy']),]

After this operation, we still encounter missing values in the following columns: “Alcohol”, “Hepatitis.B”, “BMI”, “Polio”, “Total.expenditure”, “Diphtheria”, “GDP”, “Population”, “thinness..1.19.years”, “thinness.5.9.years”, “Income.composition.of.resources”, “Schooling”. We will fill these NA values with median for each column.

na_columns <- list("Alcohol", "Hepatitis.B", "BMI", "Polio", "Total.expenditure", "Diphtheria", "GDP", "Population", "thinness..1.19.years", "thinness.5.9.years", "Income.composition.of.resources", "Schooling")

for (col in na_columns){
    m <- median(data_new[ , col], na.rm=TRUE)
    print(col)
    print(m)
    data_new[ , col][is.na(data_new[ , col])] <- m
}
## [1] "Alcohol"
## [1] 3.77
## [1] "Hepatitis.B"
## [1] 92
## [1] "BMI"
## [1] 43.35
## [1] "Polio"
## [1] 93
## [1] "Total.expenditure"
## [1] 5.75
## [1] "Diphtheria"
## [1] 93
## [1] "GDP"
## [1] 1764.974
## [1] "Population"
## [1] 1391757
## [1] "thinness..1.19.years"
## [1] 3.3
## [1] "thinness.5.9.years"
## [1] 3.4
## [1] "Income.composition.of.resources"
## [1] 0.677
## [1] "Schooling"
## [1] 12.3
summary(data_new)
##    Country               Year         Status          Life.expectancy
##  Length:2928        Min.   :2000   Length:2928        Min.   :36.30  
##  Class :character   1st Qu.:2004   Class :character   1st Qu.:63.10  
##  Mode  :character   Median :2008   Mode  :character   Median :72.10  
##                     Mean   :2008                      Mean   :69.22  
##                     3rd Qu.:2011                      3rd Qu.:75.70  
##                     Max.   :2015                      Max.   :89.00  
##  Adult.Mortality infant.deaths        Alcohol       percentage.expenditure
##  Min.   :  1.0   Min.   :   0.00   Min.   : 0.010   Min.   :    0.000     
##  1st Qu.: 74.0   1st Qu.:   0.00   1st Qu.: 1.107   1st Qu.:    4.854     
##  Median :144.0   Median :   3.00   Median : 3.770   Median :   65.611     
##  Mean   :164.8   Mean   :  30.41   Mean   : 4.559   Mean   :  740.321     
##  3rd Qu.:228.0   3rd Qu.:  22.00   3rd Qu.: 7.400   3rd Qu.:  442.614     
##  Max.   :723.0   Max.   :1800.00   Max.   :17.870   Max.   :19479.912     
##   Hepatitis.B       Measles              BMI        under.five.deaths
##  Min.   : 1.00   Min.   :     0.0   Min.   : 1.00   Min.   :   0.00  
##  1st Qu.:82.00   1st Qu.:     0.0   1st Qu.:19.40   1st Qu.:   0.00  
##  Median :92.00   Median :    17.0   Median :43.35   Median :   4.00  
##  Mean   :83.05   Mean   :  2427.9   Mean   :38.29   Mean   :  42.18  
##  3rd Qu.:96.00   3rd Qu.:   362.2   3rd Qu.:56.10   3rd Qu.:  28.00  
##  Max.   :99.00   Max.   :212183.0   Max.   :77.60   Max.   :2500.00  
##      Polio       Total.expenditure   Diphtheria       HIV.AIDS     
##  Min.   : 3.00   Min.   : 0.370    Min.   : 2.00   Min.   : 0.100  
##  1st Qu.:78.00   1st Qu.: 4.370    1st Qu.:78.00   1st Qu.: 0.100  
##  Median :93.00   Median : 5.750    Median :93.00   Median : 0.100  
##  Mean   :82.62   Mean   : 5.916    Mean   :82.39   Mean   : 1.748  
##  3rd Qu.:97.00   3rd Qu.: 7.330    3rd Qu.:97.00   3rd Qu.: 0.800  
##  Max.   :99.00   Max.   :17.600    Max.   :99.00   Max.   :50.600  
##       GDP              Population        thinness..1.19.years
##  Min.   :     1.68   Min.   :3.400e+01   Min.   : 0.100      
##  1st Qu.:   578.80   1st Qu.:4.181e+05   1st Qu.: 1.600      
##  Median :  1764.97   Median :1.392e+06   Median : 3.300      
##  Mean   :  6627.39   Mean   :1.026e+07   Mean   : 4.834      
##  3rd Qu.:  4793.63   3rd Qu.:4.593e+06   3rd Qu.: 7.100      
##  Max.   :119172.74   Max.   :1.294e+09   Max.   :27.700      
##  thinness.5.9.years Income.composition.of.resources   Schooling    
##  Min.   : 0.100     Min.   :0.0000                  Min.   : 0.00  
##  1st Qu.: 1.600     1st Qu.:0.5040                  1st Qu.:10.30  
##  Median : 3.400     Median :0.6770                  Median :12.30  
##  Mean   : 4.865     Mean   :0.6301                  Mean   :12.02  
##  3rd Qu.: 7.200     3rd Qu.:0.7730                  3rd Qu.:14.10  
##  Max.   :28.600     Max.   :0.9480                  Max.   :20.70

As we can see, we have successfully dealt with missing values.

In-depth analysis of attributes (i.e. value distribution)

names(data_new)
##  [1] "Country"                         "Year"                           
##  [3] "Status"                          "Life.expectancy"                
##  [5] "Adult.Mortality"                 "infant.deaths"                  
##  [7] "Alcohol"                         "percentage.expenditure"         
##  [9] "Hepatitis.B"                     "Measles"                        
## [11] "BMI"                             "under.five.deaths"              
## [13] "Polio"                           "Total.expenditure"              
## [15] "Diphtheria"                      "HIV.AIDS"                       
## [17] "GDP"                             "Population"                     
## [19] "thinness..1.19.years"            "thinness.5.9.years"             
## [21] "Income.composition.of.resources" "Schooling"
quantitive_cols = c("Year", "Life.expectancy", "Adult.Mortality", "infant.deaths", "Alcohol", "percentage.expenditure", "Hepatitis.B", "Measles", "BMI", "under.five.deaths", "Polio", "Total.expenditure", "Diphtheria","HIV.AIDS","GDP","Population", "thinness..1.19.years","thinness.5.9.years",     "Income.composition.of.resources", "Schooling")
l <- htmltools::tagList()

i <- 1
for (col in quantitive_cols){
  l[[i]] <- plot_ly(y = data_new[, col], type = "box",name=col, quartilemethod="exclusive")
  i <- i + 1
}

l
p1 <- plot_ly(data_new, x = ~Status) %>%
  add_histogram()

p1

Corelation between variables (graphic presentation)

correlation_data <- data_new[quantitive_cols]

fig <- plot_ly(
    x = quantitive_cols,
    y = quantitive_cols,
    z = cor(correlation_data), type = "heatmap"
)

fig

The above chart gives us important information about the correlation between variables. As we can see, there is almost perfect correlation between thinness.1.19.years and thinness.5.9.years. There also appears to be a very strong correlation between GDP and percentage.expenditure (0.9), as well as infant.deaths and under.five.deaths (0.99). Naturally, we can also see a strong negative correlation between Adult.Mortality and Life.expectancy (-0.69). Schooling and life expectancy seem to be slightly correlated as well (0.71).

For our prediction, we will need to drop some of the features that are highly correlated. We will drop thinness.5.9.years, percentage.expenditure and infant.deaths, as their correlation to our decision variable is weaker than features correlated to them.

Interavtive plot for average life duration per country with respect to year

fig <- plot_ly(data, x = data_new[, 'Year'], y = data_new[, 'Life.expectancy'], name = ~data_new[, 'Country'], type = 'scatter', mode = 'lines', color=~data_new[, 'Country']) 
fig <- fig %>% layout(legend = list(orientation = 'h'))


fig
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

# Data preparation

regression_data <- data_new[, !names(data_new) %in% c("Country", "thinness.5.9.years", "percentage.expenditure", "infant.deaths")]


kable(head(regression_data), "html") %>% kable_styling("striped") %>% scroll_box(width = "100%")
Year Status Life.expectancy Adult.Mortality Alcohol Hepatitis.B Measles BMI under.five.deaths Polio Total.expenditure Diphtheria HIV.AIDS GDP Population thinness..1.19.years Income.composition.of.resources Schooling
2015 Developing 65.0 263 0.01 65 1154 19.1 83 6 8.16 65 0.1 584.25921 33736494 17.2 0.479 10.1
2014 Developing 59.9 271 0.01 62 492 18.6 86 58 8.18 62 0.1 612.69651 327582 17.5 0.476 10.0
2013 Developing 59.9 268 0.01 64 430 18.1 89 62 8.13 64 0.1 631.74498 31731688 17.7 0.470 9.9
2012 Developing 59.5 272 0.01 67 2787 17.6 93 67 8.52 67 0.1 669.95900 3696958 17.9 0.463 9.8
2011 Developing 59.2 275 0.01 68 3013 17.2 97 68 7.87 68 0.1 63.53723 2978599 18.2 0.454 9.5
2010 Developing 58.8 279 0.01 66 1989 16.7 102 66 9.20 66 0.1 553.32894 2883167 18.4 0.448 9.2

Regressor for average life duration estimation

Train test valid split

trainval_partition <- 
    createDataPartition(
        y = regression_data$Life.expectancy,
        p = .8,
        list = FALSE)

trainval_data <- regression_data[ trainval_partition,]
test_data  <- regression_data[-trainval_partition,]

train_partition <- 
    createDataPartition(
        y = trainval_data$Life.expectancy,
        p = .8,
        list = FALSE)

train_data <- trainval_data[ train_partition,]
val_data  <- trainval_data[-train_partition,]
control <- trainControl(
    method = "repeatedcv",
    number = 10,
    repeats = 5)
linear <- train(Life.expectancy ~ .,
               data = train_data,
               trControl = control,
               method = "lm")
linear
## Linear Regression 
## 
## 1878 samples
##   17 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 1690, 1690, 1690, 1690, 1690, 1690, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   4.232244  0.8023078  3.161511
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

Model parameter tuning and accuracy calculation using R2 and RMSE measures

lasso <- train(Life.expectancy ~ .,
               data = train_data,
               trControl = control,
               method = "lasso")
lasso
## The lasso 
## 
## 1878 samples
##   17 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 1690, 1690, 1689, 1690, 1688, 1691, ... 
## Resampling results across tuning parameters:
## 
##   fraction  RMSE      Rsquared   MAE     
##   0.1       8.521102  0.6982996  6.929972
##   0.5       5.459212  0.7575138  4.110708
##   0.9       4.246008  0.8012071  3.155828
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was fraction = 0.9.
ridge <- train(Life.expectancy ~ .,
               data = train_data,
               trControl = control,
               method = "ridge")
ridge
## Ridge Regression 
## 
## 1878 samples
##   17 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 1689, 1691, 1691, 1689, 1689, 1690, ... 
## Resampling results across tuning parameters:
## 
##   lambda  RMSE      Rsquared   MAE     
##   0e+00   4.227943  0.8031544  3.162105
##   1e-04   4.227943  0.8031550  3.162129
##   1e-01   4.275968  0.8029778  3.231954
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 1e-04.

Attribute importance analysis for the best model found

ggplot(varImp(lasso))

What attribute contributes the most to longer or shorter life duration?